{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "\n",
    "# Earlier multi-model run, kept for reference (row labels / result-file prefixes):\n",
    "# offical_names = ['MinerU-0.9.3', 'Marker-0.2.17', 'Marker-1.2.3', 'Mathpix', 'docling', 'GOT-OCR', 'Nougat', 'GPT4o', 'Qwen2-VL-72B', 'InternVL2-Llama3-76B']\n",
    "# ocr_types = ['mineru1110_docyolo', 'marker', 'marker0106', 'mathpix1108', 'docling', 'GOT', 'nougat', 'gpt4o_long_prompt', 'QwenVL_long_prompt', 'internvl_long_prompt']\n",
    "\n",
    "# Folder holding the '<ocr_type>_<match_name>_metric_result.json' files.\n",
    "result_folder = '../result'\n",
    "\n",
    "# Match-method part of the result file name; 'no_split' was the other value used.\n",
    "match_name = 'quick_match'\n",
    "\n",
    "# Result-file prefixes to tabulate, and the row labels used for them\n",
    "# (offical_names is the same list object as ocr_types here).\n",
    "ocr_types = ['end2end']\n",
    "offical_names = ocr_types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_block_Edit_dist_EN</th>\n",
       "      <th>text_block_Edit_dist_CH</th>\n",
       "      <th>display_formula_Edit_dist_EN</th>\n",
       "      <th>display_formula_Edit_dist_CH</th>\n",
       "      <th>display_formula_CDM_EN</th>\n",
       "      <th>display_formula_CDM_CH</th>\n",
       "      <th>table_TEDS_EN</th>\n",
       "      <th>table_TEDS_CH</th>\n",
       "      <th>table_Edit_dist_EN</th>\n",
       "      <th>table_Edit_dist_CH</th>\n",
       "      <th>reading_order_Edit_dist_EN</th>\n",
       "      <th>reading_order_Edit_dist_CH</th>\n",
       "      <th>overall_EN</th>\n",
       "      <th>overall_CH</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>end2end</th>\n",
       "      <td>0.073</td>\n",
       "      <td>0.556</td>\n",
       "      <td>0.493</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>86.131</td>\n",
       "      <td>74.884</td>\n",
       "      <td>0.128</td>\n",
       "      <td>0.179</td>\n",
       "      <td>0.082</td>\n",
       "      <td>0.326</td>\n",
       "      <td>0.194</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         text_block_Edit_dist_EN  text_block_Edit_dist_CH  \\\n",
       "end2end                    0.073                    0.556   \n",
       "\n",
       "         display_formula_Edit_dist_EN  display_formula_Edit_dist_CH  \\\n",
       "end2end                         0.493                           NaN   \n",
       "\n",
       "        display_formula_CDM_EN display_formula_CDM_CH  table_TEDS_EN  \\\n",
       "end2end                      -                      -         86.131   \n",
       "\n",
       "         table_TEDS_CH  table_Edit_dist_EN  table_Edit_dist_CH  \\\n",
       "end2end         74.884               0.128               0.179   \n",
       "\n",
       "         reading_order_Edit_dist_EN  reading_order_Edit_dist_CH  overall_EN  \\\n",
       "end2end                       0.082                       0.326       0.194   \n",
       "\n",
       "         overall_CH  \n",
       "end2end         NaN  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Overall result: one row per model, one column per (category, metric, language).\n",
    "#\n",
    "# Reads '<ocr_type>_<match_name>_metric_result.json' from result_folder and takes the\n",
    "# page-level scores stored under the English / Simplified-Chinese language keys.\n",
    "# TEDS is scaled by 100; CDM is not read from the file and is shown as '-'.\n",
    "# overall_EN / overall_CH are the plain mean of the Edit_dist entries, so a language\n",
    "# with a missing score (NaN) yields a NaN overall value.\n",
    "\n",
    "en_key = 'language: english'\n",
    "ch_key = 'language: simplified_chinese'\n",
    "\n",
    "dict_list = []\n",
    "\n",
    "for ocr_type in ocr_types:\n",
    "    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')\n",
    "\n",
    "    with open(result_path, 'r') as f:\n",
    "        result = json.load(f)\n",
    "\n",
    "    save_dict = {}\n",
    "    en_overall = []\n",
    "    ch_overall = []\n",
    "    for category_type, metric in [('text_block', 'Edit_dist'), ('display_formula', 'Edit_dist'), ('display_formula', 'CDM'), ('table', 'TEDS'), ('table', 'Edit_dist'), ('reading_order', 'Edit_dist')]:\n",
    "        column = category_type + '_' + metric\n",
    "        if metric == 'CDM':\n",
    "            # CDM is not read from the result file here; keep a '-' placeholder column.\n",
    "            save_dict[column + '_EN'] = '-'\n",
    "            save_dict[column + '_CH'] = '-'\n",
    "            continue\n",
    "\n",
    "        page_scores = result[category_type]['page'][metric]\n",
    "        # Use np.nan (the np.NaN alias was removed in NumPy 2.0) for a missing language,\n",
    "        # for every metric, so an absent key never raises KeyError.\n",
    "        en_score = page_scores.get(en_key, np.nan)\n",
    "        ch_score = page_scores.get(ch_key, np.nan)\n",
    "\n",
    "        if metric == 'TEDS':\n",
    "            # TEDS is shown scaled by 100; NaN * 100 stays NaN.\n",
    "            save_dict[column + '_EN'] = en_score * 100\n",
    "            save_dict[column + '_CH'] = ch_score * 100\n",
    "        else:\n",
    "            save_dict[column + '_EN'] = en_score\n",
    "            save_dict[column + '_CH'] = ch_score\n",
    "\n",
    "        if metric == 'Edit_dist':\n",
    "            en_overall.append(en_score)\n",
    "            ch_overall.append(ch_score)\n",
    "\n",
    "    save_dict['overall_EN'] = sum(en_overall) / len(en_overall)\n",
    "    save_dict['overall_CH'] = sum(ch_overall) / len(ch_overall)\n",
    "    dict_list.append(save_dict)\n",
    "\n",
    "df = pd.DataFrame(dict_list, index=offical_names).round(3)\n",
    "# df.to_csv('./overall.csv')\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/petrelfs/ouyanglinke/.local/lib/python3.7/site-packages/ipykernel_launcher.py:18: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data_source: book</th>\n",
       "      <th>data_source: PPT2PDF</th>\n",
       "      <th>data_source: research_report</th>\n",
       "      <th>data_source: colorful_textbook</th>\n",
       "      <th>data_source: exam_paper</th>\n",
       "      <th>data_source: magazine</th>\n",
       "      <th>data_source: academic_literature</th>\n",
       "      <th>data_source: note</th>\n",
       "      <th>data_source: newspaper</th>\n",
       "      <th>mean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>end2end</th>\n",
       "      <td>0.43</td>\n",
       "      <td>0.004</td>\n",
       "      <td>0.613</td>\n",
       "      <td>0.49</td>\n",
       "      <td>0.039</td>\n",
       "      <td>0.362</td>\n",
       "      <td>0.123</td>\n",
       "      <td>0.582</td>\n",
       "      <td>0.562</td>\n",
       "      <td>0.356</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         data_source: book  data_source: PPT2PDF  \\\n",
       "end2end               0.43                 0.004   \n",
       "\n",
       "         data_source: research_report  data_source: colorful_textbook  \\\n",
       "end2end                         0.613                            0.49   \n",
       "\n",
       "         data_source: exam_paper  data_source: magazine  \\\n",
       "end2end                    0.039                  0.362   \n",
       "\n",
       "         data_source: academic_literature  data_source: note  \\\n",
       "end2end                             0.123              0.582   \n",
       "\n",
       "         data_source: newspaper   mean  \n",
       "end2end                   0.562  0.356  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# PDF type: page-level text_block Edit_dist per data source, one row per model.\n",
    "dict_list = []\n",
    "\n",
    "for ocr_type in ocr_types:\n",
    "    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')\n",
    "\n",
    "    with open(result_path, 'r') as f:\n",
    "        result = json.load(f)\n",
    "\n",
    "    dict_list.append(result['text_block']['page']['Edit_dist'])\n",
    "df2 = pd.DataFrame(dict_list, index=offical_names)\n",
    "\n",
    "reordered_df2 = df2.round(3)\n",
    "\n",
    "data_source_columns = ['data_source: book', 'data_source: PPT2PDF', 'data_source: research_report', 'data_source: colorful_textbook', 'data_source: exam_paper', 'data_source: magazine', 'data_source: academic_literature', 'data_source: note', 'data_source: newspaper']\n",
    "\n",
    "# The 'mean' column is the value stored under the 'ALL' key of the result file.\n",
    "# .assign returns a new frame, so nothing is written into a slice of reordered_df2\n",
    "# (the direct column assignment used before triggered SettingWithCopyWarning).\n",
    "selected_columns = reordered_df2[data_source_columns].assign(mean=reordered_df2['ALL'])\n",
    "# selected_columns.to_csv('./data_source.csv')\n",
    "\n",
    "selected_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# page special issue: page-level text_block Edit_dist mean and variance for the\n",
    "# fuzzy_scan / watermark / colorful_backgroud columns, one row per model.\n",
    "\n",
    "def get_columns(df, required_columns):\n",
    "    \"\"\"Return a new frame holding exactly `required_columns`, in that order.\n",
    "\n",
    "    Columns that `df` does not have are filled with NaN. `df` is left unmodified.\n",
    "    \"\"\"\n",
    "    return df.reindex(columns=required_columns)\n",
    "\n",
    "dict_list = []\n",
    "dict_list_var = []\n",
    "\n",
    "for ocr_type in ocr_types:\n",
    "    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')\n",
    "\n",
    "    with open(result_path, 'r') as f:\n",
    "        result = json.load(f)\n",
    "\n",
    "    dict_list.append(result['text_block']['page']['Edit_dist'])\n",
    "    dict_list_var.append(result['text_block']['page']['Edit_dist_var'])\n",
    "\n",
    "df2 = pd.DataFrame(dict_list, index=offical_names)\n",
    "df2_var = pd.DataFrame(dict_list_var, index=offical_names)\n",
    "reordered_df2 = df2.round(3)\n",
    "reordered_df2_var = df2_var.round(3)\n",
    "\n",
    "# NOTE(review): these keys are the ones this cell's rename/reindex lists already used;\n",
    "# confirm they match the attribute keys written to the result JSON. Missing keys show as NaN.\n",
    "required_columns = ['fuzzy_scan', 'watermark', 'colorful_backgroud']\n",
    "\n",
    "# Select from the frames built in this cell (not frames defined by a later cell), so the\n",
    "# cell runs on a fresh kernel. add_suffix returns new frames instead of renaming in place.\n",
    "selected_columns = get_columns(reordered_df2, required_columns).add_suffix('_mean')\n",
    "selected_columns_var = get_columns(reordered_df2_var, required_columns).add_suffix('_var')\n",
    "\n",
    "result = pd.merge(selected_columns, selected_columns_var, left_index=True, right_index=True)\n",
    "# Interleave so each attribute's mean sits next to its variance.\n",
    "result = result.reindex(columns=['fuzzy_scan_mean', 'fuzzy_scan_var', 'watermark_mean', 'watermark_var', 'colorful_backgroud_mean', 'colorful_backgroud_var'])\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/mnt/petrelfs/ouyanglinke/anaconda3/envs/layoutlmv3/lib/python3.7/site-packages/pandas/core/frame.py:5047: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  errors=errors,\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>layout: single_column_mean</th>\n",
       "      <th>layout: single_column_var</th>\n",
       "      <th>layout: double_column_mean</th>\n",
       "      <th>layout: double_column_var</th>\n",
       "      <th>layout: three_column_mean</th>\n",
       "      <th>layout: three_column_var</th>\n",
       "      <th>layout: other_layout_mean</th>\n",
       "      <th>layout: other_layout_var</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>end2end</th>\n",
       "      <td>0.084</td>\n",
       "      <td>0.095</td>\n",
       "      <td>0.299</td>\n",
       "      <td>0.137</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.134</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         layout: single_column_mean  layout: single_column_var  \\\n",
       "end2end                       0.084                      0.095   \n",
       "\n",
       "         layout: double_column_mean  layout: double_column_var  \\\n",
       "end2end                       0.299                      0.137   \n",
       "\n",
       "         layout: three_column_mean  layout: three_column_var  \\\n",
       "end2end                        0.0                       NaN   \n",
       "\n",
       "         layout: other_layout_mean  layout: other_layout_var  \n",
       "end2end                        0.6                     0.134  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reading order under different layout: page-level reading_order Edit_dist mean and\n",
    "# variance per layout type, one row per model.\n",
    "dict_list = []\n",
    "dict_list_var = []\n",
    "for ocr_type in ocr_types:\n",
    "    result_path = os.path.join(result_folder, f'{ocr_type}_{match_name}_metric_result.json')\n",
    "\n",
    "    with open(result_path, 'r') as f:\n",
    "        result = json.load(f)\n",
    "\n",
    "    # Take the mean and the variance from the same category; the variance was previously\n",
    "    # read from 'text_block', which did not match the reading_order means beside it.\n",
    "    reading_order_page = result['reading_order']['page']\n",
    "    dict_list.append(reading_order_page['Edit_dist'])\n",
    "    dict_list_var.append(reading_order_page['Edit_dist_var'])\n",
    "df3 = pd.DataFrame(dict_list, index=offical_names)\n",
    "df3_var = pd.DataFrame(dict_list_var, index=offical_names)\n",
    "reordered_df3 = df3.round(3)\n",
    "reordered_df3_var = df3_var.round(3)\n",
    "\n",
    "layout_columns = ['layout: single_column', 'layout: double_column', 'layout: three_column', 'layout: other_layout']\n",
    "\n",
    "# add_suffix returns renamed copies, avoiding the in-place rename on a column slice\n",
    "# that raised SettingWithCopyWarning.\n",
    "selected_columns3 = reordered_df3[layout_columns].add_suffix('_mean')\n",
    "selected_columns3_var = reordered_df3_var[layout_columns].add_suffix('_var')\n",
    "\n",
    "result = pd.merge(selected_columns3, selected_columns3_var, left_index=True, right_index=True)\n",
    "# Interleave so each layout's mean sits next to its variance.\n",
    "result = result.reindex(columns=[f'{col}_{stat}' for col in layout_columns for stat in ('mean', 'var')])\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_language: text_english</th>\n",
       "      <th>text_language: text_simplified_chinese</th>\n",
       "      <th>text_language: text_en_ch_mixed</th>\n",
       "      <th>text_background: white</th>\n",
       "      <th>text_background: single_colored</th>\n",
       "      <th>text_background: multi_colored</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>end2end</th>\n",
       "      <td>0.093</td>\n",
       "      <td>0.735</td>\n",
       "      <td>0.318</td>\n",
       "      <td>0.531</td>\n",
       "      <td>0.341</td>\n",
       "      <td>0.36</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         text_language: text_english  text_language: text_simplified_chinese  \\\n",
       "end2end                        0.093                                   0.735   \n",
       "\n",
       "         text_language: text_en_ch_mixed  text_background: white  \\\n",
       "end2end                            0.318                   0.531   \n",
       "\n",
       "         text_background: single_colored  text_background: multi_colored  \n",
       "end2end                            0.341                            0.36  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# text attribute: group-level text_block Edit_dist for the text_language and\n",
    "# text_background columns, one row per model.\n",
    "text_attribute_columns = [\n",
    "    'text_language: text_english',\n",
    "    'text_language: text_simplified_chinese',\n",
    "    'text_language: text_en_ch_mixed',\n",
    "    'text_background: white',\n",
    "    'text_background: single_colored',\n",
    "    'text_background: multi_colored',\n",
    "]\n",
    "\n",
    "dict_list = []\n",
    "for ocr_type in ocr_types:\n",
    "    file_name = f'{ocr_type}_{match_name}_metric_result.json'\n",
    "    with open(os.path.join(result_folder, file_name), 'r') as f:\n",
    "        result = json.load(f)\n",
    "    dict_list.append(result['text_block']['group']['Edit_dist'])\n",
    "\n",
    "df4 = pd.DataFrame(dict_list, index=offical_names).round(3)\n",
    "selected_columns = df4[text_attribute_columns]\n",
    "\n",
    "# selected_columns.to_csv('.text_attribute.csv')\n",
    "selected_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# table attribute: group-level table TEDS (scaled by 100, rounded to 1 decimal)\n",
    "# for each table attribute column, one row per model.\n",
    "table_attribute_columns = [\n",
    "    'language: table_en', 'language: table_simplified_chinese', 'language: table_en_ch_mixed',\n",
    "    'line: full_line', 'line: less_line', 'line: fewer_line', 'line: wireless_line',\n",
    "    'with_span: True', 'with_span: False',\n",
    "    'include_equation: True', 'include_equation: False',\n",
    "    'include_background: True', 'include_background: False',\n",
    "    'table_layout: vertical', 'table_layout: horizontal',\n",
    "]\n",
    "\n",
    "dict_list = []\n",
    "for ocr_type in ocr_types:\n",
    "    file_name = f'{ocr_type}_{match_name}_metric_result.json'\n",
    "    with open(os.path.join(result_folder, file_name), 'r') as f:\n",
    "        result = json.load(f)\n",
    "    dict_list.append(result['table']['group']['TEDS'])\n",
    "\n",
    "df4 = pd.DataFrame(dict_list, index=offical_names).mul(100).round(1)\n",
    "selected_columns = df4[table_attribute_columns]\n",
    "\n",
    "# selected_columns.to_csv('./table_attribute.csv')\n",
    "selected_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text_language: text_english</th>\n",
       "      <th>text_language: text_simplified_chinese</th>\n",
       "      <th>text_language: text_en_ch_mixed</th>\n",
       "      <th>text_background: white</th>\n",
       "      <th>text_background: single_colored</th>\n",
       "      <th>text_background: multi_colored</th>\n",
       "      <th>text_rotate: normal</th>\n",
       "      <th>text_rotate: rotate90</th>\n",
       "      <th>text_rotate: rotate270</th>\n",
       "      <th>text_rotate: horizontal</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>OmniDocBench_easyocr_text_ocr</th>\n",
       "      <td>0.26</td>\n",
       "      <td>0.398</td>\n",
       "      <td>0.445</td>\n",
       "      <td>0.366</td>\n",
       "      <td>0.287</td>\n",
       "      <td>0.388</td>\n",
       "      <td>0.360</td>\n",
       "      <td>0.970</td>\n",
       "      <td>0.997</td>\n",
       "      <td>0.926</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>OmniDocBench_openocr_text_ocr</th>\n",
       "      <td>0.07</td>\n",
       "      <td>0.068</td>\n",
       "      <td>0.106</td>\n",
       "      <td>0.069</td>\n",
       "      <td>0.058</td>\n",
       "      <td>0.081</td>\n",
       "      <td>0.069</td>\n",
       "      <td>0.038</td>\n",
       "      <td>0.891</td>\n",
       "      <td>0.025</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               text_language: text_english  \\\n",
       "OmniDocBench_easyocr_text_ocr                         0.26   \n",
       "OmniDocBench_openocr_text_ocr                         0.07   \n",
       "\n",
       "                               text_language: text_simplified_chinese  \\\n",
       "OmniDocBench_easyocr_text_ocr                                   0.398   \n",
       "OmniDocBench_openocr_text_ocr                                   0.068   \n",
       "\n",
       "                               text_language: text_en_ch_mixed  \\\n",
       "OmniDocBench_easyocr_text_ocr                            0.445   \n",
       "OmniDocBench_openocr_text_ocr                            0.106   \n",
       "\n",
       "                               text_background: white  \\\n",
       "OmniDocBench_easyocr_text_ocr                   0.366   \n",
       "OmniDocBench_openocr_text_ocr                   0.069   \n",
       "\n",
       "                               text_background: single_colored  \\\n",
       "OmniDocBench_easyocr_text_ocr                            0.287   \n",
       "OmniDocBench_openocr_text_ocr                            0.058   \n",
       "\n",
       "                               text_background: multi_colored  \\\n",
       "OmniDocBench_easyocr_text_ocr                           0.388   \n",
       "OmniDocBench_openocr_text_ocr                           0.081   \n",
       "\n",
       "                               text_rotate: normal  text_rotate: rotate90  \\\n",
       "OmniDocBench_easyocr_text_ocr                0.360                  0.970   \n",
       "OmniDocBench_openocr_text_ocr                0.069                  0.038   \n",
       "\n",
       "                               text_rotate: rotate270  text_rotate: horizontal  \n",
       "OmniDocBench_easyocr_text_ocr                   0.997                    0.926  \n",
       "OmniDocBench_openocr_text_ocr                   0.891                    0.025  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# text Recognition: group-level Edit_dist from the single-module text OCR result files\n",
    "# ('<ocr_type>_metric_result.json'), one row per model.\n",
    "# This cell rebinds ocr_types / offical_names / result_folder for the cells that follow.\n",
    "\n",
    "result_folder = '../result'\n",
    "ocr_types = ['OmniDocBench_easyocr_text_ocr', 'OmniDocBench_openocr_text_ocr']\n",
    "offical_names = ocr_types\n",
    "\n",
    "text_recognition_columns = [\n",
    "    'text_language: text_english',\n",
    "    'text_language: text_simplified_chinese',\n",
    "    'text_language: text_en_ch_mixed',\n",
    "    'text_background: white',\n",
    "    'text_background: single_colored',\n",
    "    'text_background: multi_colored',\n",
    "    'text_rotate: normal',\n",
    "    'text_rotate: rotate90',\n",
    "    'text_rotate: rotate270',\n",
    "    'text_rotate: horizontal',\n",
    "]\n",
    "\n",
    "dict_list = []\n",
    "for ocr_type in ocr_types:\n",
    "    file_name = f'{ocr_type}_metric_result.json'\n",
    "    with open(os.path.join(result_folder, file_name), 'r') as f:\n",
    "        result = json.load(f)\n",
    "    # These result files keep 'group' at the top level (no category key above it).\n",
    "    dict_list.append(result['group']['Edit_dist'])\n",
    "\n",
    "df4 = pd.DataFrame(dict_list, index=offical_names).round(3)\n",
    "selected_columns = df4[text_recognition_columns]\n",
    "\n",
    "# selected_columns.to_csv('.text_attribute.csv')\n",
    "selected_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# table Recognition: group-level TEDS (scaled by 100, rounded to 1 decimal) from the\n",
    "# single-module table result files ('<ocr_type>_metric_result.json'), one row per model.\n",
    "# This cell rebinds ocr_types / offical_names / result_folder.\n",
    "\n",
    "result_folder = '../result'\n",
    "ocr_types = ['OmniDocBench_rapidtable_ocr']\n",
    "offical_names = ocr_types\n",
    "\n",
    "table_recognition_columns = [\n",
    "    'language: table_en', 'language: table_simplified_chinese', 'language: table_en_ch_mixed',\n",
    "    'line: full_line', 'line: less_line', 'line: fewer_line', 'line: wireless_line',\n",
    "    'with_span: True', 'with_span: False',\n",
    "    'include_equation: True', 'include_equation: False',\n",
    "    'include_background: True', 'include_background: False',\n",
    "    'table_layout: vertical', 'table_layout: horizontal',\n",
    "]\n",
    "\n",
    "dict_list = []\n",
    "for ocr_type in ocr_types:\n",
    "    file_name = f'{ocr_type}_metric_result.json'\n",
    "    with open(os.path.join(result_folder, file_name), 'r') as f:\n",
    "        result = json.load(f)\n",
    "    # NOTE(review): the text Recognition cell reads result['group'] directly, while this\n",
    "    # one goes through result['table'] -- confirm the table result file is nested this way.\n",
    "    dict_list.append(result['table']['group']['TEDS'])\n",
    "\n",
    "df4 = pd.DataFrame(dict_list, index=offical_names).mul(100).round(1)\n",
    "selected_columns = df4[table_recognition_columns]\n",
    "\n",
    "# selected_columns.to_csv('./table_attribute.csv')\n",
    "selected_columns"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "layoutlmv3",
   "language": "python",
   "name": "layoutlmv3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
